readr::read_csv(here("data/character_list5.csv"),
progress = FALSE,
col_types = cols(
script_id = col_integer(),
imdb_character_name = col_character(),
words = col_integer(),
gender = col_character(),
age = col_character()
)) %>%
mutate(age = as.numeric(age)) -> characters_list
characters_list %>%
glimpse()
Observations: 23,048
Variables: 5
$ script_id <int> 280, 280, 280, 280, 280, 280, 280, 623, 623, 623, 623, 623, 623, 623...
$ imdb_character_name <chr> "betty", "carolyn johnson", "eleanor", "francesca johns", "madge", "...
$ words <int> 311, 873, 138, 2251, 190, 723, 1908, 328, 409, 347, 2020, 366, 160, ...
$ gender <chr> "f", "f", "f", "f", "f", "m", "m", "m", "f", "m", "m", "m", "m", "m"...
$ age <dbl> 35, NA, NA, 46, 46, 38, 65, NA, 28, NA, 58, 53, 25, 39, 33, NA, 34, ...
readr::read_csv(here("data/meta_data7.csv"),
progress = FALSE,
col_types = cols(
script_id = col_integer(),
imdb_id = col_character(),
title = col_character(),
year = col_integer(),
gross = col_integer(),
lines_data = col_character()
)) %>%
mutate(title = iconv(title,"latin1", "UTF-8")) -> meta_data
meta_data %>%
glimpse()
Observations: 2,000
Variables: 6
$ script_id <int> 1534, 1512, 1514, 1517, 1520, 6537, 3778, 623, 1525, 6030, 625, 1509, 8543, 7...
$ imdb_id <chr> "tt1022603", "tt0147800", "tt0417385", "tt2024544", "tt1542344", "tt0450385",...
$ title <chr> "(500) Days of Summer", "10 Things I Hate About You", "12 and Holding", "12 Y...
$ year <int> 2009, 1999, 2005, 2013, 2010, 2007, 1992, 2001, 2009, 2013, 1968, 2009, 2008,...
$ gross <int> 37, 65, NA, 60, 20, 91, 15, 37, 74, 80, 376, 192, 98, 204, 19, 59, 67, 36, 32...
$ lines_data <chr> "7435445256774774443342577775657744434444564456745433675534527777342375445534...
scripts_data %>%
glimpse()
Observations: 18,968
Variables: 16
$ script_id <int> 280, 280, 280, 280, 280, 280, 280, 623, 623, 623, 623, 623, 623, 623...
$ imdb_character_name <chr> "betty", "carolyn johnson", "eleanor", "francesca johns", "madge", "...
$ words <int> 311, 873, 138, 2251, 190, 723, 1908, 328, 409, 347, 2020, 366, 160, ...
$ gender <chr> "f", "f", "f", "f", "f", "m", "m", "m", "f", "m", "m", "m", "m", "m"...
$ age <dbl> 35, NA, NA, 46, 46, 38, 65, NA, 28, NA, 58, 53, 25, 39, 33, NA, 34, ...
$ imdb_id <chr> "tt0112579", "tt0112579", "tt0112579", "tt0112579", "tt0112579", "tt...
$ title <chr> "The Bridges of Madison County", "The Bridges of Madison County", "T...
$ year <int> 1995, 1995, 1995, 1995, 1995, 1995, 1995, 2001, 2001, 2001, 2001, 20...
$ gross <int> 142, 142, 142, 142, 142, 142, 142, 37, 37, 37, 37, 37, 37, 37, 37, 3...
$ lines_data <chr> "4332023434343443203433434334433434343434434344344333434443444344233...
$ fem_words <dbl> 311, 873, 138, 2251, 190, 0, 0, 0, 409, 0, 0, 0, 0, 0, 0, 148, 801, ...
$ man_words <dbl> 0, 0, 0, 0, 0, 723, 1908, 328, 0, 347, 2020, 366, 160, 1337, 1683, 0...
$ total_fem_words <dbl> 3763, 3763, 3763, 3763, 3763, 3763, 3763, 1524, 1524, 1524, 1524, 15...
$ total_man_words <dbl> 2631, 2631, 2631, 2631, 2631, 2631, 2631, 7584, 7584, 7584, 7584, 75...
$ f_m_ratio <dbl> 2.5000000, 2.5000000, 2.5000000, 2.5000000, 2.5000000, 2.5000000, 2....
$ f_m_wordratio <dbl> 1.43025466, 1.43025466, 1.43025466, 1.43025466, 1.43025466, 1.430254...
scripts_data %>%
group_by(title,year) %>%
slice(1) %>%
unique() %>%
ggplot(aes(x=f_m_wordratio,
y=(..count..)/sum(..count..))) +
geom_histogram(binwidth = 0.1,
boundary = 0,
fill = "grey",
color = "black")
scripts_data %>%
group_by(title,year) %>%
slice(1) %>%
unique() %>%
filter(f_m_wordratio < 10) %>%
ggplot(aes(x=f_m_wordratio,
y=(..count..)/sum(..count..))) +
geom_histogram(binwidth = 0.1,
fill = "grey",
color = "black")
scripts_data %>%
group_by(title,year) %>%
slice(1) %>%
unique() %>%
ggplot(aes(x="",
y=f_m_wordratio)) +
geom_violin(fill="grey",
width=0.5)
scripts_data %>%
group_by(title,year) %>%
unique() %>%
ggplot(aes(x=f_m_ratio,
y=(..count..)/sum(..count..))) +
geom_histogram(binwidth = 0.1,
boundary = 0,
fill = "grey",
color = "black") +
scale_x_continuous(breaks = seq(0,10,0.5))
scripts_data %>%
group_by(title,year) %>%
unique() %>%
ggplot(aes(x="",
y=f_m_ratio)) +
geom_violin(fill="grey",
width=0.5)
scripts_data %>%
group_by(title,year) %>%
unique() %>%
ggplot(aes(x=year)) +
geom_bar(fill = "grey",
color = "black")
scripts_data %>%
group_by(title,year) %>%
unique() %>%
ggplot(aes(x="",
y=year)) +
geom_violin(fill="grey",
width=0.5)
scripts_data %>%
group_by(title,year) %>%
unique() %>%
ggplot(aes(x=gross,
y=(..count..)/sum(..count..))) +
geom_histogram(binwidth = 50,
boundary = 0,
fill = "grey",
color = "black")
scripts_data %>%
group_by(title,year) %>%
unique() %>%
ggplot(aes(x="",
y=gross)) +
geom_violin(fill="grey",
width=0.5)
scripts_data %>%
group_by(title) %>%
slice(1) %>%
unique() %>%
ungroup() %>%
select(title,
gross,
f_m_ratio,
f_m_wordratio) -> data
select(data, -title) %>%
mutate_all(funs(scale)) -> scaled_data
scaled_data %>%
sample_n(10)
A estatística GAP compara a solução do agrupamento com cada k com a solução em um dataset onde não há estrutura de grupos.
plot_clusgap = function(clusgap, title="Gap Statistic calculation results"){
require("ggplot2")
gstab = data.frame(clusgap$Tab, k=1:nrow(clusgap$Tab))
p = ggplot(gstab, aes(k, gap)) + geom_line() + geom_point(size=5)
p = p + geom_errorbar(aes(ymax=gap+SE.sim, ymin=gap-SE.sim), width = .2)
p = p + ggtitle(title)
return(p)
}
gaps <- scaled_data %>%
clusGap(FUN = kmeans,
nstart = 20,
K.max = 8,
B = 200,
iter.max=30)
Clustering k = 1,2,..., K.max (= 8): .. done
Bootstrapping, b = 1,2,..., B (= 200) [one "." per sample]:
.................................................. 50
.................................................. 100
.................................................. 150
.................................................. 200
plot_clusgap(gaps)
set.seed(123)
# Compute and plot wss for k = 2 to k = 15.
k.max <- 15
wss <- sapply(1:k.max,
function(k){kmeans(scaled_data, k, nstart=50,iter.max = 15 )$tot.withinss})
plot(1:k.max, wss,
type="b", pch = 19, frame = FALSE,
xlab="Number of clusters K",
ylab="Total within-clusters sum of squares")
nb <- NbClust(scaled_data, diss=NULL, distance = "euclidean",
min.nc=2, max.nc=5, method = "kmeans",
index = "all", alphaBeale = 0.1)
*** : The Hubert index is a graphical method of determining the number of clusters.
In the plot of Hubert index, we seek a significant knee that corresponds to a
significant increase of the value of the measure i.e the significant peak in Hubert
index second differences plot.
*** : The D index is a graphical method of determining the number of clusters.
In the plot of D index, we seek a significant knee (the significant peak in Dindex
second differences plot) that corresponds to a significant increase of the value of
the measure.
*******************************************************************
* Among all indices:
* 5 proposed 2 as the best number of clusters
* 8 proposed 3 as the best number of clusters
* 2 proposed 4 as the best number of clusters
* 8 proposed 5 as the best number of clusters
***** Conclusion *****
* According to the majority rule, the best number of clusters is 3
*******************************************************************
hist(nb$Best.nc[1,], breaks = max(na.omit(nb$Best.nc[1,])))
n_clusters = 3
scaled_data %>%
kmeans(n_clusters, iter.max = 100, nstart = 20) -> km
p <- autoplot(km, data=scaled_data, frame = TRUE)
ggplotly(p)
row.names(scaled_data) <- data$title
toclust <- scaled_data %>%
rownames_to_column(var = "title")
km = toclust %>%
select(-title) %>%
kmeans(centers = n_clusters, iter.max = 100, nstart = 20)
km %>%
augment(toclust) %>%
gather(key = "variável", value = "valor", -title, -.cluster) %>%
ggplot(aes(x = `variável`, y = valor, group = title, colour = .cluster)) +
geom_point(alpha = 0.2) +
geom_line(alpha = .5) +
facet_wrap(~ .cluster) +
coord_flip()
dists = scaled_data %>%
dist()
scaled_data %>%
kmeans(3, iter.max = 100, nstart = 20) -> km
silhouette(km$cluster, dists) %>%
plot(col = RColorBrewer::brewer.pal(4, "Set2"),border=NA)